{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MAXP 2021初赛数据探索和处理-3\n",
    "\n",
    "使用步骤1里处理好的节点的ID，来构建DGL的graph所需要的边列表。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using backend: pytorch\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "import dgl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path\n",
    "base_path = '/Users/jamezhan/PycharmProjects/MAXP/final_dataset'\n",
    "publish_path = 'publish'\n",
    "\n",
    "link_p1_path = os.path.join(base_path, publish_path, 'link_phase1.csv')\n",
    "nodes_path = os.path.join(base_path, publish_path, 'IDandLabels.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取节点列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(3655452, 4)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>node_idx</th>\n",
       "      <th>paper_id</th>\n",
       "      <th>Label</th>\n",
       "      <th>Split_ID</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3655448</th>\n",
       "      <td>3655448</td>\n",
       "      <td>caed47d55d1e193ecb1fa97a415c13dd</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3655449</th>\n",
       "      <td>3655449</td>\n",
       "      <td>c82eb6be79a245392fb626b9a7e1f246</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3655450</th>\n",
       "      <td>3655450</td>\n",
       "      <td>926a31f6b378575204aae30b5dfa6dd3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3655451</th>\n",
       "      <td>3655451</td>\n",
       "      <td>bbace2419c3f827158ea4602f3eb35fa</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         node_idx                          paper_id Label  Split_ID\n",
       "3655448   3655448  caed47d55d1e193ecb1fa97a415c13dd   NaN         1\n",
       "3655449   3655449  c82eb6be79a245392fb626b9a7e1f246   NaN         1\n",
       "3655450   3655450  926a31f6b378575204aae30b5dfa6dd3   NaN         1\n",
       "3655451   3655451  bbace2419c3f827158ea4602f3eb35fa   NaN         1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nodes_df = pd.read_csv(nodes_path, dtype={'Label':str})\n",
    "print(nodes_df.shape)\n",
    "nodes_df.tail(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 读取边列表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(29168650, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id</th>\n",
       "      <th>reference_paper_id</th>\n",
       "      <th>phase</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f10da75ad1eaf16eb2ffe0d85b76b332</td>\n",
       "      <td>711ef25bdb2c2421c0131af77b3ede1d</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9ac5a4327bd4f3dcb424c93ca9b84087</td>\n",
       "      <td>2d91c73304c5e8a94a0e5b4956093f71</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9d91bfd4703e55dd814dfffb3d63fc33</td>\n",
       "      <td>33d4fdfe3967a1ffde9311bfe6827ef9</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>e1bdbce05528952ed6579795373782d4</td>\n",
       "      <td>4bda690abec912b3b7b228b01fb6819a</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>eb623ac4b10df96835921edabbde2951</td>\n",
       "      <td>c1a05bdfc88a73bf2830e705b2f39dbb</td>\n",
       "      <td>phase1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           paper_id                reference_paper_id   phase\n",
       "0  f10da75ad1eaf16eb2ffe0d85b76b332  711ef25bdb2c2421c0131af77b3ede1d  phase1\n",
       "1  9ac5a4327bd4f3dcb424c93ca9b84087  2d91c73304c5e8a94a0e5b4956093f71  phase1\n",
       "2  9d91bfd4703e55dd814dfffb3d63fc33  33d4fdfe3967a1ffde9311bfe6827ef9  phase1\n",
       "3  e1bdbce05528952ed6579795373782d4  4bda690abec912b3b7b228b01fb6819a  phase1\n",
       "4  eb623ac4b10df96835921edabbde2951  c1a05bdfc88a73bf2830e705b2f39dbb  phase1"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edges_df = pd.read_csv(link_p1_path)\n",
    "print(edges_df.shape)\n",
    "edges_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join点列表和边列表以生成从0开始的边列表\n",
    "\n",
    "DGL默认节点是从0开始，并以最大的ID为容量构建Graph，因此这里我们先构建从0开始的边列表。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(29168650, 10)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>paper_id_x</th>\n",
       "      <th>reference_paper_id</th>\n",
       "      <th>phase</th>\n",
       "      <th>node_idx_x</th>\n",
       "      <th>Label_x</th>\n",
       "      <th>Split_ID_x</th>\n",
       "      <th>node_idx_y</th>\n",
       "      <th>paper_id_y</th>\n",
       "      <th>Label_y</th>\n",
       "      <th>Split_ID_y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>f10da75ad1eaf16eb2ffe0d85b76b332</td>\n",
       "      <td>711ef25bdb2c2421c0131af77b3ede1d</td>\n",
       "      <td>phase1</td>\n",
       "      <td>529879</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2364950</td>\n",
       "      <td>711ef25bdb2c2421c0131af77b3ede1d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>9ac5a4327bd4f3dcb424c93ca9b84087</td>\n",
       "      <td>2d91c73304c5e8a94a0e5b4956093f71</td>\n",
       "      <td>phase1</td>\n",
       "      <td>410481</td>\n",
       "      <td>D</td>\n",
       "      <td>0</td>\n",
       "      <td>384023</td>\n",
       "      <td>2d91c73304c5e8a94a0e5b4956093f71</td>\n",
       "      <td>K</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9d91bfd4703e55dd814dfffb3d63fc33</td>\n",
       "      <td>33d4fdfe3967a1ffde9311bfe6827ef9</td>\n",
       "      <td>phase1</td>\n",
       "      <td>2196044</td>\n",
       "      <td>D</td>\n",
       "      <td>0</td>\n",
       "      <td>1895619</td>\n",
       "      <td>33d4fdfe3967a1ffde9311bfe6827ef9</td>\n",
       "      <td>N</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>e1bdbce05528952ed6579795373782d4</td>\n",
       "      <td>4bda690abec912b3b7b228b01fb6819a</td>\n",
       "      <td>phase1</td>\n",
       "      <td>2545623</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2175977</td>\n",
       "      <td>4bda690abec912b3b7b228b01fb6819a</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                         paper_id_x                reference_paper_id   phase  \\\n",
       "0  f10da75ad1eaf16eb2ffe0d85b76b332  711ef25bdb2c2421c0131af77b3ede1d  phase1   \n",
       "1  9ac5a4327bd4f3dcb424c93ca9b84087  2d91c73304c5e8a94a0e5b4956093f71  phase1   \n",
       "2  9d91bfd4703e55dd814dfffb3d63fc33  33d4fdfe3967a1ffde9311bfe6827ef9  phase1   \n",
       "3  e1bdbce05528952ed6579795373782d4  4bda690abec912b3b7b228b01fb6819a  phase1   \n",
       "\n",
       "   node_idx_x Label_x  Split_ID_x  node_idx_y  \\\n",
       "0      529879     NaN           0     2364950   \n",
       "1      410481       D           0      384023   \n",
       "2     2196044       D           0     1895619   \n",
       "3     2545623     NaN           0     2175977   \n",
       "\n",
       "                         paper_id_y Label_y  Split_ID_y  \n",
       "0  711ef25bdb2c2421c0131af77b3ede1d     NaN           0  \n",
       "1  2d91c73304c5e8a94a0e5b4956093f71       K           0  \n",
       "2  33d4fdfe3967a1ffde9311bfe6827ef9       N           0  \n",
       "3  4bda690abec912b3b7b228b01fb6819a     NaN           0  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Merge paper_id列\n",
    "edges = edges_df.merge(nodes_df, on='paper_id', how='left')\n",
    "# Merge reference_paper_id列\n",
    "edges = edges.merge(nodes_df, left_on='reference_paper_id', right_on='paper_id', how='left')\n",
    "print(edges.shape)\n",
    "edges.head(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 修改node_idx_* 列的名称作为新的node id，并只保留需要的列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>src_nid</th>\n",
       "      <th>dst_nid</th>\n",
       "      <th>paper_id</th>\n",
       "      <th>reference_paper_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>529879</td>\n",
       "      <td>2364950</td>\n",
       "      <td>f10da75ad1eaf16eb2ffe0d85b76b332</td>\n",
       "      <td>711ef25bdb2c2421c0131af77b3ede1d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>410481</td>\n",
       "      <td>384023</td>\n",
       "      <td>9ac5a4327bd4f3dcb424c93ca9b84087</td>\n",
       "      <td>2d91c73304c5e8a94a0e5b4956093f71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2196044</td>\n",
       "      <td>1895619</td>\n",
       "      <td>9d91bfd4703e55dd814dfffb3d63fc33</td>\n",
       "      <td>33d4fdfe3967a1ffde9311bfe6827ef9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2545623</td>\n",
       "      <td>2175977</td>\n",
       "      <td>e1bdbce05528952ed6579795373782d4</td>\n",
       "      <td>4bda690abec912b3b7b228b01fb6819a</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   src_nid  dst_nid                          paper_id  \\\n",
       "0   529879  2364950  f10da75ad1eaf16eb2ffe0d85b76b332   \n",
       "1   410481   384023  9ac5a4327bd4f3dcb424c93ca9b84087   \n",
       "2  2196044  1895619  9d91bfd4703e55dd814dfffb3d63fc33   \n",
       "3  2545623  2175977  e1bdbce05528952ed6579795373782d4   \n",
       "\n",
       "                 reference_paper_id  \n",
       "0  711ef25bdb2c2421c0131af77b3ede1d  \n",
       "1  2d91c73304c5e8a94a0e5b4956093f71  \n",
       "2  33d4fdfe3967a1ffde9311bfe6827ef9  \n",
       "3  4bda690abec912b3b7b228b01fb6819a  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edges.rename(columns={'paper_id_x': 'paper_id', 'node_idx_x':'src_nid', 'node_idx_y':'dst_nid'}, inplace=True)\n",
    "edges = edges[['src_nid', 'dst_nid', 'paper_id', 'reference_paper_id']]\n",
    "edges.head(4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构建DGL的Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 讲源节点和目标节点转换成Numpy的NDArray\n",
    "src_nid = edges.src_nid.to_numpy()\n",
    "dst_nid = edges.dst_nid.to_numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Graph(num_nodes=3655452, num_edges=29168650,\n",
      "      ndata_schemes={}\n",
      "      edata_schemes={})\n"
     ]
    }
   ],
   "source": [
    "# 构建一个DGL的graph\n",
    "graph = dgl.graph((src_nid, dst_nid))\n",
    "print(graph)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存Graph为二进制格式方便后面建模时的快速读取\n",
    "graph_path = os.path.join(base_path, publish_path, 'graph.bin')\n",
    "dgl.data.utils.save_graphs(graph_path, [graph])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dgl]",
   "language": "python",
   "name": "conda-env-dgl-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
